import pandas as pd
import numpy as np
import datetime
import plotly.express as px
import matplotlib.pyplot as plt
import folium
from folium.plugins import HeatMap
%matplotlib inline
# data source - https://s3.amazonaws.com/baywheels-data/202108-baywheels-tripdata.csv.zip
df = pd.read_csv('./202108-baywheels-tripdata.csv')
df.sample(3)
| ride_id | rideable_type | started_at | ended_at | start_station_name | start_station_id | end_station_name | end_station_id | start_lat | start_lng | end_lat | end_lng | member_casual | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 155398 | 13AA64F1107992AE | classic_bike | 2021-08-15 08:42:11 | 2021-08-15 08:47:27 | Valencia St at Clinton Park | SF-L22 | Bryant St at 15th St | SF-M25 | 37.769010 | -122.422248 | 37.767100 | -122.410662 | casual |
| 59988 | B33DAA0B586F87A7 | electric_bike | 2021-08-01 13:51:24 | 2021-08-01 14:07:01 | Illinois St at 20th St | SF-O30-2 | Mission Dolores Park | SF-O21 | 37.760394 | -122.387464 | 37.761375 | -122.426565 | member |
| 128954 | 32127AE4E4E6B061 | electric_bike | 2021-08-09 12:13:44 | 2021-08-09 12:18:05 | NaN | NaN | Berry St at 4th St | SF-K29-1 | 37.770000 | -122.400000 | 37.775881 | -122.393115 | member |
df.describe()
| start_lat | start_lng | end_lat | end_lng | |
|---|---|---|---|---|
| count | 207023.000000 | 207023.000000 | 206747.000000 | 206747.000000 |
| mean | 37.749444 | -122.371818 | 37.749550 | -122.371639 |
| std | 0.116645 | 0.139639 | 0.116624 | 0.139574 |
| min | 37.280000 | -122.511282 | 37.240000 | -122.520000 |
| 25% | 37.764285 | -122.426964 | 37.764277 | -122.426630 |
| 50% | 37.776501 | -122.411306 | 37.776533 | -122.410887 |
| 75% | 37.789620 | -122.394625 | 37.790000 | -122.394586 |
| max | 37.880222 | -121.810000 | 37.900000 | -121.790000 |
df.shape
(207023, 13)
Below summary info of the dataset show that there are 207023 rows/records with 13 columns/features.
Summary table also shows that the number of non-null value count for each features and corresponding data types of features. Hence, it can be observed that there are null values in 6 columns/features (start_station_name, start_station_id, end_station_name, end_station_id, end_lat, end_lng).
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 207023 entries, 0 to 207022 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ride_id 207023 non-null object 1 rideable_type 207023 non-null object 2 started_at 207023 non-null object 3 ended_at 207023 non-null object 4 start_station_name 166811 non-null object 5 start_station_id 166811 non-null object 6 end_station_name 162712 non-null object 7 end_station_id 162712 non-null object 8 start_lat 207023 non-null float64 9 start_lng 207023 non-null float64 10 end_lat 206747 non-null float64 11 end_lng 206747 non-null float64 12 member_casual 207023 non-null object dtypes: float64(4), object(9) memory usage: 20.5+ MB
# check the total number of null values in each feature.
df.isnull().sum()
ride_id 0 rideable_type 0 started_at 0 ended_at 0 start_station_name 40212 start_station_id 40212 end_station_name 44311 end_station_id 44311 start_lat 0 start_lng 0 end_lat 276 end_lng 276 member_casual 0 dtype: int64
We could have discarded rows/records with missing values straight away. By doing so, we will lose over 19% of records.
Before discarding these missing records, let's explore the possibility for data imputation. In other words, if we have the lat and lng records, by using dictionary, we could perhaps map these to station id and name.
First, let's check how many records are missing with start_station_name,start_station_id, end_station name and end_station_id?
There are 62900 records/rows with missing values for station id and name for both start and end trip.
Next, let's check IF above missing_station_data records have latitude and longitude details recorded. Below results show that all missing records with start station name and start station id records have latitude and longitude details recorded. However, 276 end_lat and end_lng are missing.
Perhaps station id and station name can be mapped with latitude and longitude if we create a dictionary with geo range?
missing_station_idx = set(df[df['start_station_id'].isna()
| df['end_station_id'].isna()
| df['start_station_name'].isna()
| df['end_station_name'].isna()
].index)
print(f'No. of records with missing station_id and station_name = {len(missing_station_idx)}')
print('No. of records with missing geo features')
print(df.loc[missing_station_idx,['start_lat','start_lng','end_lat','end_lng']].isna().sum())
No. of records with missing station_id and station_name = 62900 No. of records with missing geo features start_lat 0 start_lng 0 end_lat 276 end_lng 276 dtype: int64
When looking into the values, the recorded latitude and longitude values only have two decimal point precision. Thus, it is not precise enough to map to station id and name. It SHOULD provide 6 digits (0.000001) of precision for decimal degrees lat/lon coordinates.
df.loc[missing_station_idx,['start_station_id','start_lat','start_lng','end_station_id','end_lat','end_lng']]
| start_station_id | start_lat | start_lng | end_station_id | end_lat | end_lng | |
|---|---|---|---|---|---|---|
| 192774 | SF-J29-1 | 37.778588 | -122.392553 | NaN | 37.76 | -122.40 |
| 107 | SF-G26 | 37.785796 | -122.408865 | NaN | 37.78 | -122.41 |
| 108 | SF-G26 | 37.785766 | -122.408601 | NaN | 37.79 | -122.41 |
| 109 | SJ-Q10 | 37.320994 | -121.876355 | NaN | 37.33 | -121.90 |
| 194859 | NaN | 37.780000 | -122.390000 | NaN | 37.78 | -122.39 |
| ... | ... | ... | ... | ... | ... | ... |
| 192761 | SF-J29-1 | 37.778527 | -122.392451 | NaN | 37.79 | -122.39 |
| 192762 | SF-H27-1 | 37.784584 | -122.400879 | NaN | 37.79 | -122.40 |
| 194856 | NaN | 37.330000 | -121.890000 | NaN | 37.34 | -121.90 |
| 194857 | NaN | 37.740000 | -122.480000 | NaN | 37.73 | -122.45 |
| 194858 | NaN | 37.770000 | -122.400000 | NaN | 37.77 | -122.40 |
62900 rows × 6 columns
# dicard records/rows which contain missing values and save it as a trip_df dataframe
trip_df = df.drop(missing_station_idx)
trip_df.shape
(144123, 13)
# trip_df now have non-null values
trip_df.isna().sum()
ride_id 0 rideable_type 0 started_at 0 ended_at 0 start_station_name 0 start_station_id 0 end_station_name 0 end_station_id 0 start_lat 0 start_lng 0 end_lat 0 end_lng 0 member_casual 0 dtype: int64
fig = px.scatter_mapbox(trip_df, lat="start_lat", lon="start_lng", color="rideable_type",hover_name="start_station_name")
fig.update_layout(mapbox_style="open-street-map")
fig.show()
fig = px.scatter_mapbox(trip_df, lat="end_lat", lon="end_lng", color="member_casual")
fig.update_layout(mapbox_style="open-street-map")
fig.show()
From started_at and ended_at features, calculate trip duration_in_seconds = ended_at - started_at.
Add a new column 'duration_in_seconds' values to identify longest/shortest/average trip duration
Note: second unit is chosen, if needed, we can always convert from seconds to minutes/hours/days etc.
# first convert from string type to timestamp type
trip_df['started_at'] = trip_df['started_at'].apply(lambda x: pd.to_datetime(x, format='%Y-%m-%d %H:%M:%S'))
trip_df['ended_at'] = trip_df['ended_at'].apply(lambda x: pd.to_datetime(x, format='%Y-%m-%d %H:%M:%S'))
# calculate duration in seconds
trip_df['duration_in_seconds'] = (trip_df['ended_at']-trip_df['started_at']).dt.total_seconds().astype(int)
# Top 5 longest trip?
# sort the trip by duration in descending order (longest trip first)
trip_df[['ride_id','duration_in_seconds']].sort_values(by = 'duration_in_seconds', ascending = False)[:5]
| ride_id | duration_in_seconds | |
|---|---|---|
| 61055 | 2DD2E2025DA3A9B5 | 822692 |
| 32377 | B69C1ABB1DF57BC0 | 712734 |
| 124835 | 69FCEC6AB37246C6 | 703037 |
| 55700 | 0BE46A431ADEF4A0 | 531467 |
| 122144 | 7D53D5242AD20324 | 528058 |
# summary statistics of the duration feature
print(trip_df['duration_in_seconds'].describe())
count 144123.000000 mean 985.872720 std 5320.251806 min -2.000000 25% 385.000000 50% 639.000000 75% 1022.000000 max 822692.000000 Name: duration_in_seconds, dtype: float64
From above statistics, we could see that there are negative values in trip duration. Below result shows the ended_at timestamp is earlier than started_at timestamp
print('No. of records with trip end time earlier than start time = ',
trip_df[trip_df['duration_in_seconds']<0].shape[0])
trip_df[trip_df['duration_in_seconds']<0]
No. of records with trip end time earlier than start time = 3
| ride_id | rideable_type | started_at | ended_at | start_station_name | start_station_id | end_station_name | end_station_id | start_lat | start_lng | end_lat | end_lng | member_casual | duration_in_seconds | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 7043 | 924385CCDC8AF804 | electric_bike | 2021-08-28 11:04:33 | 2021-08-28 11:04:31 | The Embarcadero at Sansome St | SF-A27 | The Embarcadero at Sansome St | SF-A27 | 37.804783 | -122.403359 | 37.804806 | -122.403362 | member | -2 |
| 17607 | 22ECA866EE266446 | classic_bike | 2021-08-31 21:00:01 | 2021-08-31 21:00:00 | 7th Ave at Clement St | SF-G12 | 7th Ave at Clement St | SF-G12 | 37.782439 | -122.465377 | 37.782439 | -122.465377 | member | -1 |
| 148962 | 9F43201C50AC9784 | classic_bike | 2021-08-13 19:50:42 | 2021-08-13 19:50:41 | Howard St at 8th St | SF-J25 | Howard St at 8th St | SF-J25 | 37.776513 | -122.411306 | 37.776513 | -122.411306 | member | -1 |
# discard these records with negaive duration
err_idx = trip_df[trip_df['duration_in_seconds']<0].index
trip_df = trip_df.drop(err_idx,axis='index')
trip_df.shape
(144120, 14)
How many trips with ZERO second duration? Trip end time and start time are same - duration is less than a second and trip start id and trip end id are same - extremely short trip is made which doesn't have impact to geo location. Below results show that there are records with zero trip length. These records are discarded.
zero_length = trip_df[(trip_df['duration_in_seconds']==0)
& (trip_df['start_station_id'] == trip_df['end_station_id'])]
print('No. of records with ZERO trip duration = ', zero_length.shape[0])
# example of trips with zero length
zero_length.head(3)
No. of records with ZERO trip duration = 14
| ride_id | rideable_type | started_at | ended_at | start_station_name | start_station_id | end_station_name | end_station_id | start_lat | start_lng | end_lat | end_lng | member_casual | duration_in_seconds | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2949 | 5D17AE4971878762 | electric_bike | 2021-08-02 21:35:20 | 2021-08-02 21:35:20 | 22nd St Caltrain Station | SF-P30 | 22nd St Caltrain Station | SF-P30 | 37.757678 | -122.391961 | 37.757654 | -122.391940 | casual | 0 |
| 11672 | 7947DFCA39279AF5 | classic_bike | 2021-08-15 13:58:55 | 2021-08-15 13:58:55 | S Park St at 3rd St | SF-I29-2 | S Park St at 3rd St | SF-I29-2 | 37.780760 | -122.394989 | 37.780760 | -122.394989 | member | 0 |
| 17667 | 98F57167E5EFCC3D | electric_bike | 2021-08-13 10:43:37 | 2021-08-13 10:43:37 | Ryland Park | SJ-K9 | Ryland Park | SJ-K9 | 37.342758 | -121.895668 | 37.342768 | -121.895672 | member | 0 |
trip_df.shape
(144120, 14)
trip_df = trip_df[(trip_df['duration_in_seconds']!=0 & (trip_df['start_station_id'] != trip_df['end_station_id']))]
trip_df.shape
(144106, 14)
From below Figure, it can be observed that most bike trips are less than 20 minutes and the largest number of trips were in 6-8 minute range.
pc_freq = trip_df[['duration_in_seconds']].copy()
pc_freq['duration_in_minutes'] = pc_freq['duration_in_seconds']//60
pc_freq = pc_freq.duration_in_minutes.value_counts(normalize=True).reset_index().rename(columns={'index':'duration_in_minutes',
'duration_in_minutes':'Percent'})
pc_freq['Percent'] = round((pc_freq['Percent']*100),2)
pc_freq
| duration_in_minutes | Percent | |
|---|---|---|
| 0 | 7 | 6.16 |
| 1 | 6 | 6.10 |
| 2 | 8 | 5.98 |
| 3 | 5 | 5.92 |
| 4 | 9 | 5.77 |
| ... | ... | ... |
| 443 | 1471 | 0.00 |
| 444 | 448 | 0.00 |
| 445 | 1344 | 0.00 |
| 446 | 11717 | 0.00 |
| 447 | 5501 | 0.00 |
448 rows × 2 columns
fig = px.histogram(pc_freq,
x='duration_in_minutes',
y ='Percent',
height=500,
width = 700,
template='simple_white',
)
fig.update_traces(xbins=dict(start=0,end=120,size=2))
fig.update_layout(
title = 'Trip Frequency Distribution', bargap=0.03, title_x=0.5,
xaxis_title="Trip Duration (Minutes)", yaxis_title="Percent (%)",
)
fig.show()
Calculate average and median duration of bike rides in minutes.
trip_duration = trip_df[['duration_in_seconds']].copy()
trip_duration['duration_in_minutes'] = trip_duration.duration_in_seconds//60
print(f'average trip duration (minutes) = {int(trip_duration.duration_in_minutes.mean())}')
print(f'median trip duration (minutes) = {int(trip_duration.duration_in_minutes.median())}')
average trip duration (minutes) = 15 median trip duration (minutes) = 10
trip_duration.median()
duration_in_seconds 639.0 duration_in_minutes 10.0 dtype: float64
Calculate average and median duration of bike rides in minutes.
pc_freq.describe()
| duration_in_minutes | Percent | |
|---|---|---|
| count | 448.00000 | 448.000000 |
| mean | 627.71875 | 0.222366 |
| std | 1439.42303 | 0.909474 |
| min | 0.00000 | 0.000000 |
| 25% | 111.75000 | 0.000000 |
| 50% | 228.50000 | 0.000000 |
| 75% | 590.50000 | 0.010000 |
| max | 13711.00000 | 6.160000 |
there are three types of bike ('electric','classic','docked') - most popular type being electric bike
trip_df.rideable_type.unique()
array(['electric_bike', 'classic_bike', 'docked_bike'], dtype=object)
trip_df.rideable_type.value_counts()
electric_bike 76144 classic_bike 65968 docked_bike 1994 Name: rideable_type, dtype: int64
Below results show that average trip duration of docked bike are longest and electric bike being shortest.
trip_df.groupby(['rideable_type']).agg({'duration_in_seconds':['mean','min','max']})
| duration_in_seconds | |||
|---|---|---|---|
| mean | min | max | |
| rideable_type | |||
| classic_bike | 993.948809 | 1 | 88302 |
| docked_bike | 6774.361083 | 1 | 822692 |
| electric_bike | 827.511649 | 1 | 25509 |
t = trip_df[['rideable_type','member_casual','duration_in_seconds']].copy()
t['duration_in_minutes'] = t['duration_in_seconds']//60
t[:5]
| rideable_type | member_casual | duration_in_seconds | duration_in_minutes | |
|---|---|---|---|---|
| 0 | electric_bike | member | 2326 | 38 |
| 1 | classic_bike | member | 2 | 0 |
| 2 | electric_bike | member | 2676 | 44 |
| 3 | electric_bike | member | 602 | 10 |
| 4 | electric_bike | member | 496 | 8 |
fig = px.histogram(t,
x = 'duration_in_minutes',
facet_col='rideable_type',
color ='member_casual',
height=500,
template='simple_white',
)
fig.update_traces(xbins=dict(start=0,end=100,size=2))
fig.for_each_annotation(lambda a: a.update(text = a.text.replace('rideable_type=',"")))
#fig.for_each_trace(lambda b: b.update(name=b.name.replace('','')))
fig.update_layout(
title = 'Trip Frequency distribution by rideable and member type', bargap=0.03, title_x=0.5,
yaxis_title="Count",
)
fig.update_xaxes(title_text='duration (minutes)') # edit the xaxis label
fig.show()
there are two type of members ('member','casual') - most popular being casual
trip_df.member_casual.unique()
array(['member', 'casual'], dtype=object)
trip_df.member_casual.value_counts()
casual 81057 member 63049 Name: member_casual, dtype: int64
Below results show that average trip duration of casual type is longer than the member type.
trip_df.groupby(['member_casual']).agg({'duration_in_seconds':['mean','min','max']})
| duration_in_seconds | |||
|---|---|---|---|
| mean | min | max | |
| member_casual | |||
| casual | 1207.330928 | 1 | 822692 |
| member | 701.427699 | 1 | 70798 |
Below figure show the percentage of trips made by bike rideable type and user member types. It shows that electric bikes are most popular type, followed by classic bike, and docked bike are least popular type. It also shows that there are more trips made by casual type than member type.
b_gp = t[['rideable_type','member_casual']].value_counts(normalize=True).rename('percent').reset_index()
b_gp['percent'] = round((b_gp['percent']*100),2)
fig = px.bar(b_gp,
x="rideable_type",
y='percent',
width = 500,
height=500,
template='simple_white',
text = 'percent',
color='member_casual',
color_discrete_map={'member': 'gold','casual': 'salmon'}
)
fig.update_layout(title='Percent of trip made by bike type by member type',title_x=0.5,uniformtext_minsize=6)
fig.update_traces(texttemplate='%{text:.2f}%', textposition='outside')
fig.show()
From started_at and ended_at features, extract start/end day of the week. Add 'start_day_of_week' and 'end_day_of_week' columns with Monday=0,..., Friday = 4, Saturday = 5, Sunday=6. We could use this feature to identify which day (Mon-Sun) the trips were started/ended.
trip_df['start_day_of_week'] = trip_df.started_at.dt.dayofweek
trip_df['end_day_of_week'] = trip_df.ended_at.dt.dayofweek
trip_df.loc[194:196]
| ride_id | rideable_type | started_at | ended_at | start_station_name | start_station_id | end_station_name | end_station_id | start_lat | start_lng | end_lat | end_lng | member_casual | duration_in_seconds | start_day_of_week | end_day_of_week | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 194 | E49059431A161F90 | docked_bike | 2021-08-19 18:25:44 | 2021-08-21 09:24:31 | Harmon St at Adeline St | BK-I6 | Hearst Ave at Euclid Ave | BK-C9 | 37.849735 | -122.270582 | 37.875321 | -122.260155 | casual | 140327 | 3 | 5 |
| 195 | 8EE136571E04B40B | electric_bike | 2021-08-12 20:30:18 | 2021-08-12 20:43:29 | Carl St at Cole St | SF-M15 | Golden Gate Ave at Franklin St | SF-I22 | 37.765927 | -122.449260 | 37.780800 | -122.421957 | member | 791 | 3 | 3 |
| 196 | D2C538A762DCC3E7 | electric_bike | 2021-08-09 12:11:51 | 2021-08-09 12:18:47 | Jones St at Post St | SF-G25 | McCoppin St at Valencia St | SF-K22-2 | 37.787197 | -122.413348 | 37.771717 | -122.422031 | member | 416 | 0 | 0 |
From started_at and ended_at datetime stamp, we can extract start date and end date. We need this feature to easily check if a trip is made in the same day or multiple days. For example, a trip may have same start_day_of_week and end_day_of_week, but it doesn't mean it is same day trip, it may be that it is 7 days trip.
trip_df['start_date'] =trip_df.started_at.dt.date
trip_df['end_date'] = trip_df.ended_at.dt.date
Below figure shows that the highest number of trips are made on 27th Aug 2021 while lowest number of trips are made on 23rd Aug 2021.
tmp = trip_df[['start_date','start_day_of_week','duration_in_seconds','rideable_type','member_casual']].copy()
t1 = tmp.start_date.value_counts().rename('count').reset_index().rename(columns={'index':'start_date'})
t1 = t1.sort_values(by='start_date')
fig = px.line(t1,
x = 'start_date',
y= 'count',
template='simple_white',
height=400,
width = 600
)
fig.update_layout(title = 'Daily number of Trip',
title_x=0.5, yaxis_title="No. of Trip", xaxis_title='Date'
)
fig.show()
Below Figures show the break down of the number of trip made by bike and member type. It shows that there are more trips made e-bike member users than classic member users.
t2 = tmp[['start_date','rideable_type','member_casual']].value_counts().rename('count').reset_index()
t2 = t2.sort_values(by=['start_date'])
import plotly.graph_objects as go
fig = px.line(t2,
x="start_date",
y="count",
facet_col="rideable_type",
color="member_casual",
color_discrete_map={ 'member': 'gold','casual': 'salmon'},
template='simple_white',
facet_col_spacing=0.04, # space between facet column
)
fig.for_each_annotation(lambda a: a.update(text = a.text.replace('rideable_type=',"")))
fig.update_layout(title = 'No. of Trip made by rideable and member types',
title_x=0.5, yaxis_title="No. of Trip"
)
# hide subplot x-axis titles
for axis in fig.layout:
if type(fig.layout[axis]) == go.layout.XAxis:
fig.layout[axis].title.text = ''
fig.show()
Below figures show the break down of the number of day of a week trip made by bike and member type. It shows that there is an upward trend for weekends trips made by casual members whereas a downward trend for the weekends trips made by member users.
t3 = tmp.groupby(['start_day_of_week','rideable_type','member_casual'])['start_date'].count().rename('count').reset_index()
fig = px.line(t3,
x="start_day_of_week",
y="count",
facet_col="rideable_type",
color="member_casual",
color_discrete_map={ 'member': 'gold','casual': 'salmon'},
template='simple_white',
height = 400,
facet_col_spacing=0.04 # space between facet column
)
fig.for_each_annotation(lambda a: a.update(text = a.text.replace('rideable_type=',"")))
fig.update_xaxes(title_text='Day of Week') # edit the xaxis label
fig.update_layout(title = 'Total No. of Trip made by rideable and member types',
title_x=0.5, yaxis_title="Count"
)
fig.show()
How many % of trips are same day trips and how many % of trips are multiple day trip? (how many trips are longer than one day?)
Results show that most trips are made in same days and less than 0.4% of trips are longer than one day. In August, the largest number of trips are made on 27th August 2021.
sameday_trip = trip_df[trip_df['start_date'] == trip_df['end_date']]
multiday_trip = trip_df[trip_df['end_date'] > trip_df['start_date']]
sameday_n = sameday_trip.shape[0]
multiday_n = multiday_trip.shape[0]
total = trip_df.shape[0]
print(f'No.of same day trip = {sameday_n} ({round((sameday_n/total*100),2)}%)')
print(f'No.of multi day trip = {multiday_n} ({round((multiday_n/total*100),2)}%)')
No.of same day trip = 143538 (99.61%) No.of multi day trip = 568 (0.39%)
First create a utiltiy function(isweekday) that return 1 if it is a weekday, otherwise return 0. Apply this function to day of the week columns to identify if it is weekday or weekend. Then count the number of bike ride groupedby isweekday and rideable and member type features. Calculate the percentage of trips made by user membership type in weekdays and percentage of trips made by user membership type in weekend.
def isweekday(ds):
if ds < 5:
return 1
return 0
# count the number of bike ride from same day trip
a = sameday_trip[['start_date','start_day_of_week','rideable_type','member_casual']].copy()
a = a.set_axis(['date','dayofweek','rideable_type','member type'],axis=1)
# count bike ride from multiple day trip (assuming that bike ride definitely happen on start date and end date.
# *** Note that it excludes the in-between date)
b = multiday_trip[['start_date','start_day_of_week','rideable_type','member_casual']].copy()
b = b.set_axis(['date','dayofweek','rideable_type','member type'],axis=1)
c = multiday_trip[['end_date','end_day_of_week','rideable_type','member_casual']].copy()
c = c.set_axis(['date','dayofweek','rideable_type','member type'],axis=1)
nride_gp = pd.concat([a,b,c],axis=0)
nride_gp['isweekday'] = nride_gp.dayofweek.apply(isweekday)
nride_gp.head(5)
gp1 = nride_gp[['isweekday','rideable_type','member type']].value_counts(normalize=True).rename('percent').reset_index()
gp1['percent'] = round((gp1['percent']*100),2)
gp1
| isweekday | rideable_type | member type | percent | |
|---|---|---|---|---|
| 0 | 1 | electric_bike | casual | 20.70 |
| 1 | 1 | electric_bike | member | 17.28 |
| 2 | 1 | classic_bike | member | 15.92 |
| 3 | 1 | classic_bike | casual | 15.49 |
| 4 | 0 | electric_bike | casual | 9.61 |
| 5 | 0 | classic_bike | casual | 9.11 |
| 6 | 0 | classic_bike | member | 5.25 |
| 7 | 0 | electric_bike | member | 5.21 |
| 8 | 1 | docked_bike | casual | 0.82 |
| 9 | 0 | docked_bike | casual | 0.60 |
below figures show the break down of weekday vs weekend trip by bike and member type
fig = px.bar(gp1,
x = 'isweekday',
y ='percent',
facet_col = 'rideable_type',
text ='percent',
color = 'member type',
color_discrete_map={ 'member': 'gold','casual': 'salmon'},
template = 'simple_white',
)
fig.for_each_annotation(lambda x: x.update(text = x.text.replace('rideable_type=','')))
fig.update_layout(title = 'Percent of rides on weekdays vs weekend by bike type', title_x=0.5,uniformtext_minsize=6)
fig.update_traces(texttemplate='%{text:.2f}%', textposition='outside')
fig.update_layout(
xaxis1 = dict(
tickmode = 'array',
tickvals = [0, 1],
ticktext = ['weekend', 'weekday']
),
xaxis2 = dict(
tickmode = 'array',
tickvals = [0, 1],
ticktext = ['weekend', 'weekday']
),
xaxis3 = dict(
tickmode = 'array',
tickvals = [0, 1],
ticktext = ['weekend', 'weekday']
)
)
# hide subplot x-axis titles
for axis in fig.layout:
if type(fig.layout[axis])==go.layout.XAxis:
fig.layout[axis].title.text = ''
fig.show()
trip_df.head(3)
| ride_id | rideable_type | started_at | ended_at | start_station_name | start_station_id | end_station_name | end_station_id | start_lat | start_lng | end_lat | end_lng | member_casual | duration_in_seconds | start_day_of_week | end_day_of_week | start_date | end_date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | A67CB6CC130B48AB | electric_bike | 2021-08-07 13:30:11 | 2021-08-07 14:08:57 | S Van Ness Ave at Market St | SF-J23-2 | The Embarcadero at Bryant St | SF-G30-2 | 37.774554 | -122.419896 | 37.787410 | -122.388207 | member | 2326 | 5 | 5 | 2021-08-07 | 2021-08-07 |
| 1 | EA6D1C08FB8D1751 | classic_bike | 2021-08-16 18:34:12 | 2021-08-16 18:34:14 | 7th Ave at Cabrillo St | SF-J12 | 7th Ave at Cabrillo St | SF-J12 | 37.775120 | -122.464998 | 37.775120 | -122.464998 | member | 2 | 0 | 0 | 2021-08-16 | 2021-08-16 |
| 2 | 91E70C07BFA0BAED | electric_bike | 2021-08-31 18:28:04 | 2021-08-31 19:12:40 | 7th Ave at Cabrillo St | SF-J12 | 7th Ave at Cabrillo St | SF-J12 | 37.775129 | -122.464995 | 37.775115 | -122.465001 | member | 2676 | 1 | 1 | 2021-08-31 | 2021-08-31 |
Below map shows the 5 electric bike path with longest trip duration.
map_df = trip_df[['rideable_type','start_lat','start_lng','start_station_id','start_station_name',
'end_lat','end_lng','end_station_id','end_station_name','duration_in_seconds']].copy()
map_df = map_df[map_df.rideable_type=='electric_bike']
map_df.sort_values(by='duration_in_seconds',inplace=True,ascending=False)
map_df = map_df[:5]
map_df = map_df.reset_index(drop=True)
map_df
| rideable_type | start_lat | start_lng | start_station_id | start_station_name | end_lat | end_lng | end_station_id | end_station_name | duration_in_seconds | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | electric_bike | 37.773453 | -122.415932 | SF-K24 | 11th St at Natoma St | 37.750387 | -122.390409 | SF-S29 | Indiana St at Cesar Chavez St | 25509 |
| 1 | electric_bike | 37.787340 | -122.388067 | SF-G30-2 | The Embarcadero at Bryant St | 37.779973 | -122.480475 | SF-G8 | 21st Ave at Geary Blvd | 25275 |
| 2 | electric_bike | 37.776564 | -122.408045 | SF-J26-2 | Folsom St at 7th St | 37.777730 | -122.406362 | SF-J26 | Victoria Manalo Draves Park | 24369 |
| 3 | electric_bike | 37.788625 | -122.420543 | SF-F23 | Bush St at Polk St | 37.782370 | -122.465573 | SF-G12 | 7th Ave at Clement St | 20239 |
| 4 | electric_bike | 37.756859 | -122.405949 | SF-P26 | 22nd St at Potrero Ave | 37.785879 | -122.408827 | SF-G26 | Cyril Magnin St at Ellis St | 17605 |
lat_lst = pd.concat([map_df.start_lat,map_df.end_lat],axis=0).to_list()
lng_lst = pd.concat([map_df.start_lng,map_df.end_lng],axis=0).to_list()
lat_path=[]
lng_path=[]
for i in range(len(map_df.start_lat)):
lat_path.append([lat_lst[i],lat_lst[i+5]])
lng_path.append([lng_lst[i],lng_lst[i+5]])
lat_min1 = map_df.start_lat.min()
lat_min2 = map_df.end_lat.min()
lng_min1 = map_df.start_lng.min()
lng_min2 = map_df.end_lng.min()
fig = go.Figure(go.Scattermapbox(
lat= list(map_df.start_lat),
lon= list(map_df.start_lng),
mode='markers',
marker=dict(size=10,color='green')
))
for i in range(len(map_df)):
fig.add_trace(go.Scattermapbox(
mode='markers+lines',
lat= lat_path[i],
lon = lng_path[i],
line = dict(width = 2,color = 'red'),
#marker = {'size':10}
)
)
fig.update_layout(
mapbox = {'zoom':10,
'center':{'lon':lng_min1 , 'lat':lat_min1},
#'style': 'stamen-terrain',
'style': "open-street-map",
'center':{'lon':lng_min2 , 'lat':lat_min2}
},
showlegend = False,
)
fig.show()
There are 465 unique start station and 466 unique end station. The top 5 popular start stations and end stations are the same.
print(f'No. of unique start station = {trip_df.start_station_name.nunique()}')
print(f'No. of unique end station = {trip_df.end_station_name.nunique()}')
No. of unique start station = 465 No. of unique end station = 466
top5_start = trip_df.groupby(['start_station_name'])['ride_id'].count().sort_values(ascending=False)[:5]
top5_end = trip_df.groupby(['end_station_name'])['ride_id'].count().sort_values(ascending=False)[:5]
print('Top 5 popular start station: ', top5_start)
print('Top 5 popular end station: ', top5_end)
Top 5 popular start station: start_station_name Market St at 10th St 2134 Powell St BART Station (Market St at 4th St) 1840 Powell St BART Station (Market St at 5th St) 1467 Market St at Steuart St 1383 17th St at Valencia St 1279 Name: ride_id, dtype: int64 Top 5 popular end station: end_station_name Market St at 10th St 2147 Powell St BART Station (Market St at 4th St) 1911 Powell St BART Station (Market St at 5th St) 1546 Market St at Steuart St 1512 17th St at Valencia St 1434 Name: ride_id, dtype: int64
top5 = list(top5_start.index)
top5
['Market St at 10th St', 'Powell St BART Station (Market St at 4th St)', 'Powell St BART Station (Market St at 5th St)', 'Market St at Steuart St', '17th St at Valencia St']
lat_lst = []
lng_lst =[]
for station in top5:
lat_lst.append(trip_df[trip_df.start_station_name==station]['start_lat'].mean())
lng_lst.append(trip_df[trip_df.start_station_name==station]['start_lng'].mean())
Add Map marker for top5 station
map = folium.Map(location = [lat_lst[0],lng_lst[0]] , zoom_start=12)
for i in range(len(lat_lst)):
marker = folium.Marker(location = [lat_lst[i],lng_lst[i]])
marker.add_to(map)
map
- Market St at 10th St
- Powell St BART Station (Market St at 4th St)
- Powell St BART Station (Market St at 5th St)
- Market St at Steuart St
- 17th St at Valencia St